{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare database without text pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every election year: merge the raw manifestos with the candidate key and\n",
    "# save them together with a lightly cleaned copy of the text.\n",
    "special_chars = ['\\x93', '\\x92', '\\x94', '\\x91', '\\x80']\n",
    "\n",
    "\n",
    "def clean_text(raw_text):\n",
    "    \"\"\"Lower-case, drop the special characters, replace double spaces once, then strip.\"\"\"\n",
    "    cleaned = raw_text.lower()\n",
    "    for char in special_chars:\n",
    "        cleaned = cleaned.replace(char, '')\n",
    "    return cleaned.replace('  ', ' ').strip()\n",
    "\n",
    "\n",
    "for year in ['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']:\n",
    "    print(year)\n",
    "    df = pd.read_pickle(dir + '/data/raw/manifestos/df_raw_' + year)\n",
    "    # id_doc gets an 'id' prefix in every year except 1997\n",
    "    if year != '1997':\n",
    "        df['id_doc'] = 'id' + df['id_doc']\n",
    "\n",
    "    key = pd.read_stata(dir + '/data/intermediate/key_augmented_' + year + '.dta')\n",
    "    df = pd.merge(df[['id_doc', 'text']], key, how='left', on='id_doc', indicator=True)\n",
    "    # the left merge must have matched every manifesto to a row of the key file\n",
    "    assert (df['_merge'] == 'both').all()\n",
    "    df = df.drop(columns='_merge')\n",
    "\n",
    "    cleaned_texts = [clean_text(text) for text in df['text'].tolist()]\n",
    "    df = df.join(pd.DataFrame({'text_clean': cleaned_texts}))\n",
    "    df.to_pickle(dir + \"/data/intermediate/df_min_\" + year)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Personal pronouns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "def count_word(text, list_word):\n",
    "    \"\"\"Count, for each document in `text`, how often the terms of `list_word` occur.\n",
    "\n",
    "    A term only counts when it stands alone: it may not be preceded by a word\n",
    "    character and, if it ends with a letter or digit, may not be followed by one.\n",
    "    Hence 'je' is found in 'je' and 'puis-je' but not in 'sujet', and the elided\n",
    "    form j + apostrophe is found in front of the verb it is attached to.\n",
    "\n",
    "    text: iterable of strings, one per document.\n",
    "    list_word: list of terms to look for.\n",
    "    Returns a list with one integer per document.\n",
    "    \"\"\"\n",
    "    alternatives = []\n",
    "    for word in list_word:\n",
    "        pattern = r'(?<!\\w)' + re.escape(word)\n",
    "        # terms ending in an apostrophe are glued to the next word, so no right boundary there\n",
    "        if word[-1:].isalnum():\n",
    "            pattern += r'(?!\\w)'\n",
    "        alternatives.append(pattern)\n",
    "    if not alternatives:\n",
    "        # an empty pattern would match everywhere\n",
    "        return [0 for _ in text]\n",
    "    regex = re.compile('|'.join(alternatives))\n",
    "    return [len(regex.findall(document)) for document in text]\n",
    "\n",
    "\n",
    "for year in ['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']:\n",
    "    print(year)\n",
    "    df = pd.read_pickle(dir + '/data/intermediate/df_min_' + year)\n",
    "\n",
    "    # number of whitespace-separated tokens per document\n",
    "    size = [len(x.split()) for x in df['text_clean'].tolist()]\n",
    "\n",
    "    ##Personal pronouns: 'je' and its elided form with a typographic or a straight apostrophe\n",
    "    # (the comma between the two elided forms matters: adjacent string literals are concatenated)\n",
    "    test = count_word(df['text_clean'], ['je', 'j’', \"j'\"])\n",
    "\n",
    "    df_indiv = df.join(pd.DataFrame({'nb_perso': test, 'size': size}))\n",
    "    df_indiv['sh_perso'] = df_indiv['nb_perso'] / df_indiv['size']\n",
    "    print(\"Mean share of perso:\")\n",
    "    print(np.mean(df_indiv['sh_perso']))\n",
    "\n",
    "    df_indiv[['id_unique_cand', 'tour', 'nb_perso', 'sh_perso']].to_csv(dir + '/data/intermediate/df_perso_' + year + '.csv')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Past participle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of past participles (tag 'VPP') among the tokens returned by nlp_token_class.\n",
    "CHUNK_SIZE = 2000   # number of characters passed to nlp_token_class per call\n",
    "MAX_CHARS = 22000   # characters beyond this position are not tagged\n",
    "\n",
    "for year in ['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']:\n",
    "    print(year)\n",
    "    df = pd.read_pickle(dir + '/data/intermediate/df_' + year)\n",
    "    df = df[df['tour']==1].reset_index(drop=True) #first round only to speed up code\n",
    "    df = df[df['text_clean']!=''].reset_index(drop=True)\n",
    "\n",
    "    count_vpp = []\n",
    "    sh_vpp = []\n",
    "    skipped_chunks = 0\n",
    "\n",
    "    for i, text in enumerate(df['text_clean']):\n",
    "        if i%100==0:\n",
    "            print(i)\n",
    "\n",
    "        # start from zero for every document, so a failed chunk can never\n",
    "        # fall back on the totals of the previous document\n",
    "        count = 0\n",
    "        length = 0\n",
    "        for start in range(0, min(len(text), MAX_CHARS), CHUNK_SIZE):\n",
    "            try:\n",
    "                tags = [x['entity_group'] for x in nlp_token_class(text[start:start + CHUNK_SIZE])]\n",
    "            except IndexError:\n",
    "                # best effort: this chunk is left out of both totals\n",
    "                # NOTE(review): presumably raised when a chunk is too long for the model - confirm\n",
    "                skipped_chunks += 1\n",
    "                continue\n",
    "            count += tags.count('VPP')\n",
    "            length += len(tags)\n",
    "\n",
    "        count_vpp.append(count)\n",
    "        # without any tagged token the share is undefined: store NaN instead of dividing by zero\n",
    "        sh_vpp.append(count / length if length else float('nan'))\n",
    "\n",
    "    if skipped_chunks:\n",
    "        print('chunks skipped after IndexError:', skipped_chunks)\n",
    "\n",
    "    df_vpp = df.join(pd.DataFrame({'nb_vpp': count_vpp, 'sh_vpp': sh_vpp}))\n",
    "    df_vpp[['id_unique_cand', 'tour', 'nb_vpp', 'sh_vpp']].to_csv(dir + '/data/intermediate/df_vpp_' + year + '.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
